In [1]:
import graphlab
In [3]:
# Build the `people` SFrame from the saved 'people_wiki.gl/' path.
people_path = 'people_wiki.gl/'
people = graphlab.SFrame(people_path)
The data contains, for each person: a link to the Wikipedia article, the person's name, and the text of the article.
In [11]:
# Preview the first rows of `people` to check the columns that were loaded.
people.head()
Out[11]:
In [5]:
# Size of the dataset: length of the `people` SFrame.
len(people)
Out[5]:
In [5]:
# Keep only the row(s) whose name is exactly 'Barack Obama'.
is_obama = people['name'] == 'Barack Obama'
obama = people[is_obama]
In [6]:
# Display the selection made for Barack Obama.
obama
Out[6]:
In [8]:
# Show the article text column of the Obama selection.
obama['text']
Out[8]:
In [9]:
# Select George Clooney's row and show his article text for comparison.
is_clooney = people['name'] == 'George Clooney'
clooney = people[is_clooney]
clooney['text']
Out[9]:
In [12]:
# Count the words in Obama's article text and store the result as a new
# 'word_count' column on the `obama` selection.
obama_text = obama['text']
obama['word_count'] = graphlab.text_analytics.count_words(obama_text)
In [11]:
# Print the word counts computed for Obama's article.
# The parenthesised call form runs on both Python 2 and Python 3 kernels;
# the bare `print x` statement is a SyntaxError on Python 3.
print(obama['word_count'])
In [16]:
# Stack the 'word_count' column into a two-column (word, count) table so the
# counts can be sorted and inspected.
obama_counts_only = obama[['word_count']]
obama_word_count_table = obama_counts_only.stack(
    'word_count',
    new_column_name=['word', 'count']
)
In [14]:
# Preview the first rows of the stacked (word, count) table.
obama_word_count_table.head()
Out[14]:
In [17]:
# Order the words from highest to lowest count and display the result.
obama_words_by_count = obama_word_count_table.sort('count', ascending=False)
obama_words_by_count
Out[17]:
The most common words are uninformative ones such as "the", "in", and "and".
In [18]:
# Compute word counts for every article and attach them to `people` as the
# 'word_count' column, then preview the updated table.
all_word_counts = graphlab.text_analytics.count_words(people['text'])
people['word_count'] = all_word_counts
people.head()
Out[18]:
In [20]:
# Compute TF-IDF scores from the per-article word counts and display them.
corpus_word_counts = people['word_count']
tfidf = graphlab.text_analytics.tf_idf(corpus_word_counts)
tfidf
Out[20]:
In [22]:
# Attach the TF-IDF scores to `people` as the 'tfidf' column.
# Older GraphLab Create releases return an SFrame with a 'docs' column from
# tf_idf(); newer releases return the scores directly. Handle both so this
# cell does not fail on `tfidf['docs']` after a library upgrade.
if isinstance(tfidf, graphlab.SFrame):
    people['tfidf'] = tfidf['docs']
else:
    people['tfidf'] = tfidf
people.head()
Out[22]:
In [23]:
# Re-select Obama's row so the selection is taken from `people` after the
# 'word_count' and 'tfidf' columns were added to it.
is_obama = people['name'] == 'Barack Obama'
obama = people[is_obama]
In [24]:
# Stack Obama's 'tfidf' column into (word, tfidf) rows and show them from the
# highest score to the lowest.
obama_tfidf_table = obama[['tfidf']].stack(
    'tfidf',
    new_column_name=['word', 'tfidf']
)
obama_tfidf_table.sort('tfidf', ascending=False)
Out[24]:
Words with highest TF-IDF are much more informative.
In [25]:
# Select Bill Clinton's row for the distance comparison.
is_clinton = people['name'] == 'Bill Clinton'
clinton = people[is_clinton]
In [26]:
# Select David Beckham's row for the distance comparison.
is_beckham = people['name'] == 'David Beckham'
beckham = people[is_beckham]
We will use cosine distance, which is given by
$$\text{cosine distance} = 1 - \text{cosine similarity}$$
and find that the article about President Obama is closer to the one about former President Clinton than to that of footballer David Beckham.
(Tip: a lower number means a smaller distance and therefore a higher similarity.)
In [27]:
# Cosine distance between the TF-IDF values of the first Obama row and the
# first Clinton row.
obama_tfidf = obama['tfidf'][0]
clinton_tfidf = clinton['tfidf'][0]
graphlab.distances.cosine(obama_tfidf, clinton_tfidf)
Out[27]:
In [28]:
# Cosine distance between the TF-IDF values of the first Obama row and the
# first Beckham row.
obama_tfidf = obama['tfidf'][0]
beckham_tfidf = beckham['tfidf'][0]
graphlab.distances.cosine(obama_tfidf, beckham_tfidf)
Out[28]:
In [29]:
# Build a nearest-neighbors model over all articles, using the 'tfidf' column
# as the feature and the person's name as the label.
# NOTE(review): no `distance` argument is passed here, whereas the later
# nn_model_wc / nn_model_tfidf models pass distance='cosine'. The library
# default applies - confirm it matches the cosine distance discussed in the text.
knn_model = graphlab.nearest_neighbors.create(
    people,
    features=['tfidf'],
    label='name'
)
In [30]:
# Query the model with Obama's row to list the articles nearest to his.
knn_model.query(obama)
Out[30]:
As we can see, President Obama's article is closest to the one about his vice president, Joe Biden, followed by those of other politicians.
In [31]:
# Select Taylor Swift's row to use as a query.
is_swift = people['name'] == 'Taylor Swift'
swift = people[is_swift]
In [32]:
# Query the model with Taylor Swift's row to list the articles nearest to hers.
knn_model.query(swift)
Out[32]:
In [33]:
# Select Angelina Jolie's row to use as a query.
is_jolie = people['name'] == 'Angelina Jolie'
jolie = people[is_jolie]
In [34]:
# Query the model with Angelina Jolie's row to list the articles nearest to hers.
knn_model.query(jolie)
Out[34]:
In [32]:
# Select Arnold Schwarzenegger's row to use as a query.
is_arnold = people['name'] == 'Arnold Schwarzenegger'
arnold = people[is_arnold]
In [33]:
# Query the model with Arnold Schwarzenegger's row to list the articles nearest to his.
knn_model.query(arnold)
Out[33]:
In [43]:
# Select Elton John's row; it is reused by the word-count, TF-IDF and
# nearest-neighbor cells that follow.
is_elton = people['name'] == 'Elton John'
elton = people[is_elton]
Out[43]:
In [46]:
# Stack Elton John's 'word_count' column into (word, count) rows, ordered from
# the highest count to the lowest.
elton_counts_stacked = elton[['word_count']].stack(
    'word_count',
    new_column_name=['word', 'count']
)
elton_word_count_table = elton_counts_stacked.sort('count', ascending=False)
In [47]:
# Preview the top rows of the table, which was sorted by count in descending order.
elton_word_count_table.head()
Out[47]:
In [49]:
# Stack Elton John's 'tfidf' column into (word, tfidf) rows and show them from
# the highest score to the lowest.
elton_tfidf_table = elton[['tfidf']].stack(
    'tfidf',
    new_column_name=['word', 'tfidf']
)
elton_tfidf_table.sort('tfidf', ascending=False)
Out[49]:
In [ ]:
In [50]:
# Select the two people whose articles will be compared against Elton John's.
is_victoria = people['name'] == 'Victoria Beckham'
is_mccartney = people['name'] == 'Paul McCartney'
victoria = people[is_victoria]
mccartney = people[is_mccartney]
In [52]:
# Cosine distance between the TF-IDF values of the first Elton John row and
# the first Victoria Beckham row.
elton_tfidf = elton['tfidf'][0]
victoria_tfidf = victoria['tfidf'][0]
graphlab.distances.cosine(elton_tfidf, victoria_tfidf)
Out[52]:
In [53]:
# Cosine distance between the TF-IDF values of the first Elton John row and
# the first Paul McCartney row.
elton_tfidf = elton['tfidf'][0]
mccartney_tfidf = mccartney['tfidf'][0]
graphlab.distances.cosine(elton_tfidf, mccartney_tfidf)
Out[53]:
In [55]:
# Nearest-neighbors model over the raw 'word_count' feature, using cosine
# distance and labelling results by name.
nn_model_wc = graphlab.nearest_neighbors.create(
    people,
    features=['word_count'],
    label='name',
    distance='cosine'
)
In [56]:
# Nearest-neighbors model over the 'tfidf' feature, using cosine distance and
# labelling results by name.
nn_model_tfidf = graphlab.nearest_neighbors.create(
    people,
    features=['tfidf'],
    label='name',
    distance='cosine'
)
In [59]:
# Nearest neighbors of Elton John's article under the word-count model.
nn_model_wc.query(elton)
Out[59]:
In [60]:
# Nearest neighbors of Elton John's article under the TF-IDF model.
nn_model_tfidf.query(elton)
Out[60]:
In [61]:
# Nearest neighbors of Victoria Beckham's article under the word-count model.
nn_model_wc.query(victoria)
Out[61]:
In [62]:
# Nearest neighbors of Victoria Beckham's article under the TF-IDF model.
nn_model_tfidf.query(victoria)
Out[62]:
In [ ]: